package com.jiangxue.service;

/**
 * Created by Administrator on 2015/11/18.
 */
import java.io.*;
import java.util.ArrayList;
import java.util.List;

import com.jiangxue.common.entity.Document;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.poi.hwpf.extractor.WordExtractor;

public class Fenci {

    /**
     * Reads every Word document named {@code Doc1.doc .. DocN.doc} under
     * {@code D:\DOC} and prints its tokens using the Paoding Chinese analyzer.
     *
     * @param args unused
     * @throws Exception if the directory is missing or a document cannot be
     *                   read or tokenized
     */
    public static void main(String[] args) throws Exception {
        String path = "D:\\DOC";
        File docPath = new File(path);
        File[] files = docPath.listFiles();
        if (files == null) {
            // listFiles() returns null (not an empty array) when the path
            // does not exist or is not a directory — guard against the NPE.
            throw new FileNotFoundException("Not a readable directory: " + path);
        }
        int filesLength = files.length; // number of documents under the path
        // NOTE(review): docList is populated nowhere yet — presumably work in
        // progress; kept so the intended collection step can be filled in.
        List<Document> docList = new ArrayList<Document>(filesLength);
        Analyzer analyzer = new PaodingAnalyzer();

        for (int i = 1; i <= filesLength; i++) {
            File file = new File(path + "\\Doc" + i + ".doc");
            String docText = readText(file);
            StringReader reader = new StringReader(docText);
            // The first argument of tokenStream() is the Lucene *field name*,
            // not the content; the original mistakenly passed the text itself.
            TokenStream ts = analyzer.tokenStream("contents", reader);
            try {
                // Consume and print the tokens (this loop existed but was
                // commented out, leaving the stream created and never used).
                Token t = ts.next();
                while (t != null) {
                    System.out.print(t.termText() + "     ");
                    t = ts.next();
                }
            } finally {
                ts.close();
                reader.close();
            }
        }
    }

    /**
     * Extracts the plain text of a Word (.doc) file via Apache POI.
     *
     * @param file the .doc file to read
     * @return the document's text content
     * @throws Exception if the file cannot be opened or parsed
     */
    private static String readText(File file) throws Exception {
        InputStream is = new FileInputStream(file);
        try {
            WordExtractor extractor = new WordExtractor(is);
            return extractor.getText();
        } finally {
            // The original leaked this stream — it was never closed.
            is.close();
        }
    }
}